library(data.table); library(magrittr)
library(ggplot2); theme_set(theme_bw(base_size = 14))
library(patchwork)
library(SingleCellExperiment)
library(pcaExplorer); library(org.Mm.eg.db)
source("src/marker_gene_extract_and_plot.R")
source("src/load_data_from_box.R")
library(pheatmap)
library(org.Mm.eg.db); library(clusterProfiler)

Integrating data sets

  • Yao et al (2019): day 7 CD8 T cells from acute (Arm) and chronic (Cl13) infection
    • will focus on the memory precursors and progenitor-like cells (which are from repl. 1 only)
  • Schauder et al (2021): day 129 following acute infection

For details of how we retrieved the public data sets, see Schauder2021.Rmd and Yao2019.Rmd.

## Code here does not need to be executed, it's simply for documentation purposes
## to show the details of how we generated the composite integrated file that
## we're providing for download
library(magrittr)
library(batchelor)
library(scran)
library(scater)
library(patchwork)

fnall <- "sce_integrated_with_YaoProgsMpecs-and-Schauder.rds"
fnmerged <- "sceMultiout_integrated_with_YaoProgsMpecs-and-Schauder.rds"

## MAKE A LIST of INDIVIDUAL SCEs ==============================================
## our data -------------
sln <- load_data_from_Box("https://wcm.box.com/shared/static/2wvbdfs3vja2cnlckcqpk20eu1693o8o.rds") 
rownames(sln) <- rowData(sln)$ID
scel <- lapply(unique(sln$Sample), function(x){
    out.sce <- sln[, sln$Sample == x]
    assay(out.sce, "logcounts") <- NULL
    colData(out.sce) <- colData(out.sce)[, c("Sample","Barcode","cell")]
    rownames(out.sce)  <- rowData(out.sce)$ID
    return(out.sce)
})
names(scel) <- unique(sln$Sample)

## Schauder -------------
scs <- readRDS("../2021-08_Schauder2021/sce_integrated_with_Schauder2021.rds")
assay(scs, "logcounts") <- NULL
scs <- scs[,!grepl("^pLN",scs$Sample)]
colData(scs) <- colData(scs)[, c("Sample","Barcode","cell")]
rownames(scs)  <- rowData(scs)$ID
scel$Schauder <- scs

## Yao -------------
scy <- readRDS("../2021-08_Yao2019/sce_integrated_with_Yao2019.rds")
assay(scy, "logcounts") <- NULL
scy <- scy[,!grepl("^pLN",scy$Sample)]

yaolabs <- read.table("../2021-08_Yao2019/Yao2019_metadata.txt", header = FALSE, skip = 1)
names(yaolabs) <-  c("YaoCell","nGene","nUMI","orig.ident","percent.mito","res.0.5","res.1","res.1.5")
## res.0.5 = clusters; 10 = progs, 7 = MPecs
kp_cells <- subset(yaolabs, res.0.5 %in% c(10, 7)) %>% .$YaoCell
scy$YaoCell <- paste(gsub("_[12]$", "", scy$Sample), gsub("-[0-9]$","", scy$Barcode), sep = "_")

for(i in unique(scy$Sample)){
    out.sce <- scy[, scy$Sample == i]
    out.sce <- out.sce[, out.sce$YaoCell %in% kp_cells]
    colData(out.sce) <- colData(out.sce)[, c("Sample","Barcode","cell")]
    rownames(out.sce)  <- rowData(out.sce)$ID
    scel[[i]] <- out.sce}

## PREPPING ============================================================
rd_qc <- lapply(scel,perFeatureQCMetrics)
for(x in seq_along(scel)){rowData(scel[[x]])$qc.mean <- rd_qc[[x]]$mean}
scel2 <- lapply(scel, function(x){ x[rowData(x)$qc.mean > 0.001,]})
## combine
universe <- Reduce(intersect, lapply(scel2, rownames))
scel2 <- lapply(scel2, "[", i=universe)
# generate logcounts
normed.sce <- do.call(multiBatchNorm, scel2) # returns a list
# Identifying a set of HVGs using stats from all batches, using logcounts
all.dec <- lapply(normed.sce, modelGeneVar)
combined.dec <- do.call(combineVar, all.dec)
combined.hvg <- getTopHVGs(combined.dec, n=2000)

## MERGING with MNN ====================================================
## prep
combined <- noCorrect(normed.sce)
assayNames(combined) <- "logcounts"
combined$Sample <- combined$batch
combined$batch <- NULL
set.seed(1010100)
## progressively merge cells from each sample in each batch until all cells 
## are mapped onto a common coordinate space
multiout <- fastMNN(combined, batch=combined$Sample, subset.row=combined.hvg)
# Renaming metadata fields for easier communication later.
multiout$Sample <- multiout$batch

## UMAP----------------------------------
set.seed(10101010)
multiout <- runUMAP(multiout, dimred="corrected")

## Clustering -----------------------------
g <- buildSNNGraph(multiout, use.dimred="corrected", k = 20)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k20 <- factor(clusters$membership)

g <- buildSNNGraph(multiout, use.dimred="corrected", k = 50)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k50 <- factor(clusters$membership)

g <- buildSNNGraph(multiout, use.dimred="corrected", k = 10)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k10 <- factor(clusters$membership)

g <- buildSNNGraph(multiout, use.dimred="corrected", k = 5)
clusters <- igraph::cluster_louvain(g)
multiout$cluster_with_SchauderYao_k5 <- factor(clusters$membership)

saveRDS(multiout,file = fnmerged)

# generate composite file for the combined file of all shared genes ============
## combine
universe <- Reduce(intersect, lapply(scel, rownames))
scel <- lapply(scel, "[", i=universe)
comb.mat <- lapply(scel, function(x) counts(x)) %>% do.call(cbind, .)
colnames(comb.mat) <- unlist(lapply(scel, function(x) colnames(x)))

### rowData
rd <- rowData(scel[[1]])[, c("ID","Symbol")]
rd <- rd[rownames(comb.mat),]

## colData 
cd <- lapply(scel, function(x) colData(x)[, c("Sample","Barcode","cell")]) %>% do.call(rbind, .)
cd <- cd[colnames(comb.mat),]

scAll <- SingleCellExperiment(
    assays = list(counts = comb.mat), 
    colData = cd, rowData = rd)

## add redDims from the merged data set
rdu <- reducedDim(multiout, "UMAP") 
reducedDim(scAll, "UMAP") <- rdu[colnames(scAll),]
reducedDim(scAll, "PCA_corr") <- reducedDim(multiout, "corrected")

## add log-counts
qckclst <- quickCluster(scAll, method = "igraph", min.mean = 0.1)
scAll <- computeSumFactors(scAll, min.mean=0.1, cluster = qckclst)
scAll <- scater::logNormCounts(scAll)

saveRDS(scAll, file = fnall)
colData(multiout) %>% saveRDS("colData_multiout_YaoProgsMpecs-and-Schauder.rds")

## fix ColData ======================================================
cd2 <- colData(multiout)
cd2 <- as.data.frame(cd2)
cd2$cell <- rownames(cd2)
cd2 <- cd2[, c("cell",grep("cluster", names(cd2), value=TRUE))]
newcd <- merge(colData(scAll), cd2, by = "cell") 

## add YAO LABELS -----------------------------------------------
newcd$YaoCell <- paste(gsub("_[12]$", "", newcd$Sample), gsub("-[0-9]$","", newcd$Barcode), sep = "_")
newcd <- merge(newcd, yaolabs, by="YaoCell", all.x = TRUE)

## add OUR LABELS ------------------------------------------------
cdus <- colData(sln)
newcd <- cdus[, c("cell","CD62L","label")] %>% as.data.frame %>% 
    merge(newcd, ., on = "cell", all.x = TRUE)
newcd$label <- ifelse(!is.na(newcd$label), newcd$label,
    ifelse(grepl("GSM3732587", newcd$Sample), "Schauder",
        ifelse(grepl("^D", newcd$Sample),  paste0("Yao_", gsub("_1$", "", gsub("_P14","",newcd$Sample))),
            "no.clue")))
## 10 = progs, 7 = MPecs
newcd$label2 <- ifelse(is.na(newcd$res.0.5), newcd$label,
    ifelse(newcd$res.0.5 == 10, "ProgLike", 
        ifelse(newcd$res.0.5 == 7, paste(newcd$label, "MPECS", sep = "_"), "no.clue")))
newcd$label2 <- ifelse(newcd$label2 == "Schauder", "Tcm", newcd$label2)

#> table(newcd$label2)
#             AIC               AMC          ProgLike               Tcm 
#             1704              1335               114               535 
#       transition  Yao_D7_Arm_MPECS Yao_D7_Cl13_MPECS 
#             2016               386                 2 


## finalize colData
rownames(newcd) <- newcd$cell
colData(scAll) <- newcd[colnames(scAll),]

## adjust the naming scheme for our populations
scAll$label <- ifelse(scAll$label == "AIC","AP", ifelse(scAll$label == "transition", "intermediate", ifelse(scAll$label == "AMC", "AM", scAll$label)))
scAll$label2 <- ifelse(scAll$label2 == "AIC","AP", ifelse(scAll$label2 == "transition", "intermediate", ifelse(scAll$label2 == "AMC", "AM", scAll$label2)))
## remove the lonely two cells classified as D7-Cl13-MPECS
scAll <- scAll[, scAll$label2 != "Yao_D7_Cl13_MPECS"]
scAll$label2 <- gsub("Yao_D7_Arm_", "", scAll$label2)
scAll$label2 <- factor(scAll$label2, levels = c("AP","intermediate","AM","MPECS","ProgLike", "Tcm"), ordered = TRUE)
scAll$Ref <- ifelse(grepl("Yao",scAll$label), "Yao", ifelse(grepl("Schauder", scAll$label), "Schauder", "Gearty"))


rownames(scAll) <- scater::uniquifyFeatureNames(ID = rowData(scAll)$ID, 
    names = rowData(scAll)$Symbol)

saveRDS(scAll, file = fnall)
sc3 <- load_RDSdata_from_Box(shared_link = "https://wcm.box.com/shared/static/ixtkzzow5b4r2ck3u6zit2jtep7dunzv.rds")
## Cached data here: ~/Library/Caches/BiocFileCache/d3a0698a2fae_ixtkzzow5b4r2ck3u6zit2jtep7dunzv.rds
## define color schemes ---------------------------------------------------------
popcols <- c("#1E498F", "#629DD4","#5E3DD8", # dark blue, light blue, intermediate blue
    "#228B22", #"bisque1",
    "#9ED9B9",#"lightpink1", 
    "maroon1") 
names(popcols) <- c("AP","intermediate","AM", 
    "Tcm",
    "MPECS","ProgLike")

sample_cols <- c(
    "D7_P14_Cl13_1" = "sienna4", "D7_P14_Arm_1"="sienna2",
    "GSM3732587_Day129"="wheat3",
    "pLN_1"="blue","pLN_2"="dodgerblue1","pLN_3"="lightskyblue")


cluster_cols <- c('#B1E2F9','limegreen','grey30','#FFC914','#0066E2','#FC71E9','#00E5CA','#E31A1C','grey83','#FF7F00','#378C07','#6A3D9A','#FFFF99','#B15928')
names(cluster_cols) <- sort(unique(sc3$cluster_with_SchauderYao_k20))
cluster_cols <- cluster_cols[sort(unique(sc3$cluster_with_SchauderYao_k20))]

Global comparisons

We can pretend that the individual cell types of interest are pseudo-bulk samples, i.e. we can sum up the reads across all cells of the same label-sample combination (e.g. all cells from Sample “pLN_1” that are labelled as “AP” will form one pseudo-bulk sample “pLN_1_AP”). This helps us to apply more robust bulk-RNA-seq methods for global comparisons across the samples.

summed.scf <- scater::aggregateAcrossCells(sc3,
    ids=colData(sc3)[,c("label2","Sample")])
library(edgeR)
y <- DGEList(counts(summed.scf), samples=colData(summed.scf))
keep <- filterByExpr(y, group=summed.scf$Sample)
y <- y[keep,]
y <- calcNormFactors(y)

PCA

PCA <- prcomp(t(cpm(y, log = TRUE)), scale = F)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
dataGG = data.frame(PC1 = PCA$x[,1],PC2 = PCA$x[,2], 
    PC3 = PCA$x[,3],PC4 = PCA$x[,4],
    sampleName = row.names(y$samples),
    y$samples)
dataGG$MySampleName <- paste(dataGG$Sample.1, dataGG$label2.1, sep = "_")

p1 <- ggplot(dataGG, aes(x = PC1, y = PC2, fill = label2.1, shape = Ref)) +
    geom_point(size = 8,  alpha = .85) +
    xlab(paste0("PC1, VarExp:", round(percentVar[1],4))) +
    ylab(paste0("PC2, VarExp:", round(percentVar[2],4))) +
    scale_fill_manual(values = popcols, name = "") + 
    scale_shape_manual(values = c(21,22,23), name = "") +
   theme(legend.position = "bottom")

p2 <- ggplot(dataGG, aes(x = PC1, y = PC2, color = label2.1, label = MySampleName)) +
    geom_text(size = 6) + scale_color_manual(values = popcols) +
    theme(legend.position = "none")

p1 | p2

par(mfrow=c(2,1))
hi_loadings(PCA,whichpc = as.integer(1),topN = 20)
hi_loadings(PCA,whichpc = as.integer(2),topN = 20)

hc <- hclust( as.dist(1-cor(cpm(y, log = TRUE))), method="complete")
hc$labels <- dataGG$MySampleName
#plot(hc)
library(dendextend)
dend <- as.dendrogram(hc)
labels_colors(dend) <- c(rep(popcols[["AP"]], 3), popcols[["AM"]], rep(popcols[["intermediate"]], 3), rep(popcols[["AM"]], 2), popcols[["Tcm"]], popcols[["MPECS"]], popcols[["ProgLike"]])

par(mar=c(15,4,4,2))
rotate(dend, c( 'pLN_1_AP', 'pLN_3_AP', 'pLN_4_AP', 'pLN_1_intermediate', 'pLN_3_intermediate', 'pLN_4_intermediate', 'pLN_1_AM', 'pLN_3_AM', 'pLN_4_AM', 'D7_P14_Arm_1_MPECS', 'D7_P14_Cl13_1_ProgLike','GSM3732587_Day129_Tcm')) %>% plot

Single-cell level: markers distinguishing the populations

comparing AP vs ProgLike vs Tcm and so on

Taking all genes that are either up- or down in any one of the populations (FDR <= 1%) and that are at least in the top 50 for each population.

LabelMarkers <- scran::findMarkers(sc3,
    group = sc3$label2)

goi <- lapply(LabelMarkers, function(x) rownames(subset(as.data.frame(x), FDR <= 0.01 & Top <= 50))) %>% unlist %>% unique
## with Tcf7 as a color bar 
plot_marker_heatmap(sc3, 
    gns=unique(c("Tox",goi)), 
    gns_as_cellAnnotation = c("Tcf7","Tox"), exclude_genes = "Tox",
    exprs_values = "logcounts",
    show_HKgenes = "both",
    n_quant_breaks = 300,
   scale="row",col_palette = c("mediumorchid","black","yellow"),
    fontsize_row = 7,
  add_cell_annotation = data.frame(
        row.names = colnames(sc3),
        #cluster = sc3$cluster_with_SchauderYao_k20,
        population = sc3$label2),
    define_anno_cols = list(
        #cluster = cluster_cols,
        population = popcols,
        Tox = scales::brewer_pal("seq", "Reds")(5)[1:4],
        Tcf7 = scales::brewer_pal("seq", "Reds")(5)[1:4]),
    main = "Marker genes of the individual populations\n(FDR 1%, Rank <= 50)"
    )

GO term enrichments

Determining the top enriched GO terms for markers that are specifically overexpressed in the different populations.

## marker genes
cm.up <- scran::findMarkers(sc3,
    group = sc3$label2, direction = "up")
mks.up <- extract_markers(sc3, 
    marker_search_result = cm.up, FDR_thresh = 0.01,
    rank_thresh = 300)
#mks.up <- mks.up[ !grepl("^Rp[ls]", gene_symbol)]

## get ENTREZ IDs
all.entrez <-  clusterProfiler::bitr(rownames(sc3), 
    fromType="SYMBOL", toType="ENTREZID",
    OrgDb="org.Mm.eg.db") %>%  as.data.table
#all.entrez_noRibo <- all.entrez[!grepl("^Rp[ls]", SYMBOL)]
setnames(all.entrez, names(all.entrez), c("gene_symbol", "entrez"))

## generate list of ENTREZ IDs of interest
eg <- all.entrez[mks.up, on = "gene_symbol"]
## focus on those that are UP and UNIQUELY so per population
clstcomp.list <- lapply(names(cm.up), function(x) eg[up_in == x & classify == "unique"]$entrez )
names(clstcomp.list) <- names(cm.up)
cc.gobp <- compareCluster(clstcomp.list, fun = "enrichGO", OrgDb = org.Mm.eg.db,
    ont = "BP", pvalueCutoff=0.05, readable=TRUE, universe = all.entrez$entrez)
## visualization
dotplot(cc.gobp, showCategory = 10) +
    ggtitle("Overrepresented GO BP terms for markers that are\nuniquely upregulated in either population") + 
     scale_color_gradientn(colours =rev(c("azure2", "darkseagreen1", "seagreen2","forestgreen")), limits = c(0, 0.05))

  • geneRatio should be number of genes that overlap gene set divided by size of gene set

Here are some of the top enriched GO terms in detail:

cnetplot(cc.gobp) + scale_fill_manual(values = popcols, name = "")

Session Info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dendextend_1.15.1           edgeR_3.30.3               
##  [3] limma_3.44.3                BiocFileCache_1.12.1       
##  [5] dbplyr_2.0.0                clusterProfiler_3.16.1     
##  [7] pheatmap_1.0.12             org.Mm.eg.db_3.11.4        
##  [9] AnnotationDbi_1.50.3        pcaExplorer_2.14.2         
## [11] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2
## [13] DelayedArray_0.14.1         matrixStats_0.57.0         
## [15] Biobase_2.48.0              GenomicRanges_1.40.0       
## [17] GenomeInfoDb_1.24.2         IRanges_2.22.2             
## [19] S4Vectors_0.26.1            BiocGenerics_0.34.0        
## [21] patchwork_1.1.1             ggplot2_3.3.3              
## [23] magrittr_2.0.1              data.table_1.13.6          
## 
## loaded via a namespace (and not attached):
##   [1] shinydashboard_0.7.1      tidyselect_1.1.1         
##   [3] heatmaply_1.2.1           RSQLite_2.2.2            
##   [5] htmlwidgets_1.5.3         grid_4.0.2               
##   [7] TSP_1.1-10                BiocParallel_1.22.0      
##   [9] scatterpie_0.1.5          munsell_0.5.0            
##  [11] codetools_0.2-18          statmod_1.4.35           
##  [13] scran_1.16.0              DT_0.17                  
##  [15] withr_2.3.0               colorspace_2.0-0         
##  [17] GOSemSim_2.14.2           Category_2.54.0          
##  [19] knitr_1.30                DOSE_3.14.0              
##  [21] NMF_0.23.0                labeling_0.4.2           
##  [23] urltools_1.7.3            GenomeInfoDbData_1.2.3   
##  [25] polyclip_1.10-0           topGO_2.40.0             
##  [27] bit64_4.0.5               farver_2.0.3             
##  [29] downloader_0.4            vctrs_0.3.8              
##  [31] generics_0.1.0            xfun_0.20                
##  [33] R6_2.5.0                  doParallel_1.0.16        
##  [35] ggbeeswarm_0.6.0          graphlayouts_0.7.1       
##  [37] rsvd_1.0.3                seriation_1.3.0          
##  [39] locfit_1.5-9.4            bitops_1.0-6             
##  [41] shinyAce_0.4.1            fgsea_1.14.0             
##  [43] gridGraphics_0.5-1        assertthat_0.2.1         
##  [45] promises_1.1.1            scales_1.1.1             
##  [47] ggraph_2.0.4              enrichplot_1.8.1         
##  [49] beeswarm_0.2.3            gtable_0.3.0             
##  [51] drat_0.1.8                tidygraph_1.2.0          
##  [53] rlang_0.4.11              genefilter_1.70.0        
##  [55] splines_4.0.2             lazyeval_0.2.2           
##  [57] shinyBS_0.61              europepmc_0.4            
##  [59] BiocManager_1.30.10       yaml_2.2.1               
##  [61] reshape2_1.4.4            threejs_0.3.3            
##  [63] crosstalk_1.1.1           httpuv_1.5.4             
##  [65] qvalue_2.20.0             RBGL_1.64.0              
##  [67] tools_4.0.2               ggplotify_0.0.5          
##  [69] gridBase_0.4-7            ellipsis_0.3.2           
##  [71] RColorBrewer_1.1-2        ggridges_0.5.3           
##  [73] Rcpp_1.0.6                plyr_1.8.6               
##  [75] base64enc_0.1-3           progress_1.2.2           
##  [77] zlibbioc_1.34.0           purrr_0.3.4              
##  [79] RCurl_1.98-1.2            prettyunits_1.1.1        
##  [81] openssl_1.4.3             viridis_0.5.1            
##  [83] cowplot_1.1.1             ggrepel_0.9.0            
##  [85] cluster_2.1.0             DO.db_2.9                
##  [87] SparseM_1.78              triebeard_0.3.0          
##  [89] hms_1.0.0                 mime_0.9                 
##  [91] evaluate_0.14             xtable_1.8-4             
##  [93] XML_3.99-0.5              gridExtra_2.3            
##  [95] compiler_4.0.2            biomaRt_2.44.4           
##  [97] scater_1.16.2             tibble_3.0.4             
##  [99] crayon_1.3.4              htmltools_0.5.1.1        
## [101] GOstats_2.54.0            later_1.1.0.1            
## [103] tidyr_1.1.2               geneplotter_1.66.0       
## [105] DBI_1.1.0                 tweenr_1.0.1             
## [107] MASS_7.3-53               rappdirs_0.3.3           
## [109] Matrix_1.3-4              igraph_1.2.6             
## [111] pkgconfig_2.0.3           rvcheck_0.1.8            
## [113] registry_0.5-1            plotly_4.9.3             
## [115] xml2_1.3.2                foreach_1.5.1            
## [117] annotate_1.66.0           vipor_0.4.5              
## [119] dqrng_0.2.1               rngtools_1.5             
## [121] pkgmaker_0.32.2           webshot_0.5.2            
## [123] XVector_0.28.0            AnnotationForge_1.30.1   
## [125] stringr_1.4.0             digest_0.6.27            
## [127] graph_1.68.0              rmarkdown_2.6            
## [129] fastmatch_1.1-0           DelayedMatrixStats_1.10.1
## [131] GSEABase_1.52.1           curl_4.3                 
## [133] shiny_1.5.0               lifecycle_0.2.0          
## [135] jsonlite_1.7.2            BiocNeighbors_1.6.0      
## [137] viridisLite_0.3.0         askpass_1.1              
## [139] pillar_1.4.7              lattice_0.20-41          
## [141] fastmap_1.1.0             httr_1.4.2               
## [143] survival_3.2-7            GO.db_3.11.4             
## [145] glue_1.4.2                iterators_1.0.13         
## [147] bit_4.0.4                 Rgraphviz_2.32.0         
## [149] ggforce_0.3.2             stringi_1.5.3            
## [151] blob_1.2.1                DESeq2_1.28.1            
## [153] BiocSingular_1.4.0        memoise_1.1.0            
## [155] dplyr_1.0.2               irlba_2.3.3